# template elements
# presentation

ICJIA R Workshop

Preface

2018-02-13
Bobae Kang
(Bobae.Kang@illinois.gov)

plot of chunk unnamed-chunk-1

plot of chunk unnamed-chunk-2

Source: Pixabay.com

A brief intro to ...

plot of chunk unnamed-chunk-3

Source: r-project.org

What is R?

“R is a language and environment for statistical computing and graphics.” - The R Foundation

  • Built for data analysis and visualization
  • One of the most popular choices of programming language among academic researchers and data scientists
  • Open source (free!)
  • Built for statistical analysis
  • Reproducible and transparent
  • Extensible through powerful third-party libraries
  • Enabling researchers to tackle a variety of tasks using a single platform

plot of chunk unnamed-chunk-4

Source: Time Magazine

Data manipulation

plot of chunk unnamed-chunk-5

Source: Wikimedia.org
# peek at the first rows of the data
head(ispcrime_tbl)
# A tibble: 6 x 12
   year county  violentCrime murder  rape robbery aggAssault propertyCrime
  <int> <fct>          <int>  <int> <int>   <int>      <int>         <int>
1  2011 Adams            218      0    37      15        166          1555
2  2011 Alexan~          119      0    14       4        101           290
3  2011 Bond               6      1     0       0          5           211
4  2011 Boone             59      0    24       8         27           733
5  2011 Brown              7      0     1       0          6            38
6  2011 Bureau            42      0     4       3         35           505
# ... with 4 more variables: burglary <int>, larcenyTft <int>,
#   MVTft <int>, arson <int>
# get a quick summary of each column
summary(ispcrime_tbl)
      year            county     violentCrime       murder       
 Min.   :2011   Adams    :  5   Min.   :    0   Min.   :  0.000  
 1st Qu.:2012   Alexander:  5   1st Qu.:   19   1st Qu.:  0.000  
 Median :2013   Bond     :  5   Median :   42   Median :  0.000  
 Mean   :2013   Boone    :  5   Mean   :  501   Mean   :  7.026  
 3rd Qu.:2014   Brown    :  5   3rd Qu.:  133   3rd Qu.:  1.000  
 Max.   :2015   Bureau   :  5   Max.   :33348   Max.   :566.000  
                (Other)  :480   NA's   :7       NA's   :7        
      rape            robbery          aggAssault      propertyCrime   
 Min.   :   0.00   Min.   :    0.0   Min.   :    0.0   Min.   :     0  
 1st Qu.:   1.00   1st Qu.:    0.0   1st Qu.:   15.0   1st Qu.:   133  
 Median :   6.00   Median :    2.0   Median :   33.0   Median :   349  
 Mean   :  41.29   Mean   :  172.3   Mean   :  280.4   Mean   :  2913  
 3rd Qu.:  22.00   3rd Qu.:   13.0   3rd Qu.:  102.0   3rd Qu.:  1190  
 Max.   :1986.00   Max.   :16095.0   Max.   :15129.0   Max.   :178902  
 NA's   :7         NA's   :7         NA's   :7         NA's   :7       
    burglary         larcenyTft           MVTft             arson        
 Min.   :    0.0   Min.   :     0.0   Min.   :    0.0   Min.   :   0.00  
 1st Qu.:   35.5   1st Qu.:    85.5   1st Qu.:    3.0   1st Qu.:   1.00  
 Median :   79.0   Median :   258.0   Median :   10.0   Median :   2.00  
 Mean   :  589.3   Mean   :  2084.9   Mean   :  215.2   Mean   :  23.45  
 3rd Qu.:  268.0   3rd Qu.:   852.0   3rd Qu.:   30.0   3rd Qu.:   8.50  
 Max.   :38485.0   Max.   :116145.0   Max.   :22879.0   Max.   :1418.00  
 NA's   :7         NA's   :7          NA's   :7         NA's   :7        
# filter to keep only counties starting with C for 2015
#   while creating and showing a new variable for total crime count
ispcrime_tbl %>%
  filter(substr(county, 1, 1) == "C", year == 2015) %>%
  mutate(totalCrime = violentCrime + propertyCrime) %>%
  select(year, county, totalCrime)
# A tibble: 12 x 3
    year county     totalCrime
   <int> <fct>           <int>
 1  2015 Calhoun            NA
 2  2015 Carroll           176
 3  2015 Cass              154
 4  2015 Champaign        6486
 5  2015 Christian         292
 6  2015 Clark             103
 7  2015 Clay              191
 8  2015 Clinton           423
 9  2015 Coles             805
10  2015 Cook           153575
11  2015 Crawford          282
12  2015 Cumberland         42
# get annual average count of total crime (violent + property) by county
ispcrime_tbl %>%
  group_by(county) %>%
  summarise(annualAvgCrime = sum(violentCrime, propertyCrime, na.rm = TRUE) / 5)
# A tibble: 102 x 2
   county    annualAvgCrime
   <fct>              <dbl>
 1 Adams             1724  
 2 Alexander          385  
 3 Bond               190  
 4 Boone              426  
 5 Brown               39.0
 6 Bureau             480  
 7 Calhoun             13.8
 8 Carroll            196  
 9 Cass               109  
10 Champaign         6567  
# ... with 92 more rows
# merge regions data and count the number of rows by region
ispcrime_tbl %>%
  left_join(regions) %>%
  group_by(region) %>%
  count()
# A tibble: 4 x 2
# Groups:   region [4]
  region       n
  <fct>    <int>
1 Central    230
2 Cook         5
3 Northern    85
4 Southern   190

Data visualization

plot of chunk unnamed-chunk-12

Source: Wikimedia.org
# line plot of violent crime trend by region
ggplot(ispcrime_tbl2, aes(x = year, y = violentCrime, color = region)) +
  stat_summary(geom = "line", fun.y = "sum") +
  labs(title = "Violent crime trend by region", x = "Year", y = "Count") +
  theme_classic(base_size = 15)

plot of chunk unnamed-chunk-14

# bar plot of violent crime mean count by region
ggplot(ispcrime_tbl2, aes(x = region, y = violentCrime, fill = region)) +
  stat_summary(geom = "bar", fun.y = "mean") +
  labs(title = "Violent crime count by region", x = "Region", y = "Count") +
  theme_classic(base_size = 15)

plot of chunk unnamed-chunk-15

# histogram of violent crime count by county (excluding Cook)
ggplot(filter(ispcrime_tbl2, county != "Cook"), aes(x = violentCrime)) +
  geom_histogram(binwidth = 100) +
  facet_wrap(~ year) +
  labs(title = "Histogram of violent crime count by county per year",
       x = "Violent crime count", y = "Count") +
  theme_classic(base_size = 15)

plot of chunk unnamed-chunk-16

Statistical modeling

plot of chunk unnamed-chunk-17

Source: Pixabay.com

Example - simple linear model

# simple linear regression with lm()
fit1 <- lm(violentCrime ~ propertyCrime, ispcrime)
summary(fit1)

Call:
lm(formula = violentCrime ~ propertyCrime, data = ispcrime)

Residuals:
    Min      1Q  Median      3Q     Max 
-2239.5    -2.2    57.0    78.3  3992.9 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -79.768287  16.496961  -4.835 1.77e-06 ***
propertyCrime   0.199367   0.001059 188.303  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 363.5 on 501 degrees of freedom
  (7 observations deleted due to missingness)
Multiple R-squared:  0.9861,    Adjusted R-squared:  0.986 
F-statistic: 3.546e+04 on 1 and 501 DF,  p-value: < 2.2e-16
# plot the model fit
plot(violentCrime ~ propertyCrime, ispcrime)
abline(fit1)

plot of chunk unnamed-chunk-19

# show diagnostic plots
par(mfrow=c(2, 2))
plot(fit1)

plot of chunk unnamed-chunk-20

Generalized linear models

# examples of generalized linear models with glm()
logistic_reg <- glm(binary ~ x1 + x2, data = mydata, family = binomial())
poisson_reg <- glm(count ~ x1 + x2, data = mydata, family = poisson())
gamma_reg <- glm(y ~ x1 + x2, data = mydata, family = Gamma())

Other advanced models

  • time series models (e.g. stats and forecast packages)
  • survival models (e.g. survival package)
  • machine learning (e.g. caret and mlr packages)

And more!

plot of chunk unnamed-chunk-22

Reports

  • HTML documents for web publishing
    • create interactive workflow using R Notebook
    • add interactive elements using htmlwidgets and/or shiny
  • PDF documents for printing
  • MS Word documents

Example - R Notebook

plot of chunk unnamed-chunk-23

Slideshow

plot of chunk unnamed-chunk-24

Dashboard

Website

Objectives

plot of chunk unnamed-chunk-27

Technical objectives

  • Import and manipulate tabular data files using R;
  • Create simple data visualizations to extract insight from data using R;
  • Perform basic statistical analysis using R; and
  • Generate a report on a simple data analysis task using R.

Fundamental objectives

  • Understand the basic elements of the R programming language;
  • Employ the programmatic approach to research and data analysis projects; and
  • Leverage online resources to find solutions to specific questions on using R for a given task.

Structure

plot of chunk unnamed-chunk-28

Overall setup

  • Six modules
  • One module per week
  • Each module consists of two parts
    • except the first module on introduction
  • All workshop materials (slides and notes) will be available
  • I will be available, too, for answering questions

Modules

  1. Introduction to R
  2. R basics
  3. Data analysis in R
  4. Data visualization in R
  5. Statistical modeling in R
  6. Sharing your analysis and more

Questions?

plot of chunk unnamed-chunk-29

Source: Giphy.com